library(rstudioapi)
library(data.table)
setwd(
dirname(
rstudioapi::callFun(
'getActiveDocumentContext'
)$path
)
)
getwd()
## [1] "F:/AEStrategies/fda/binder"
DF = data.frame(x=c("b","b","b","a","a"),v=rnorm(5))
DF
library(data.table)
DT = data.table(DF)
DT
CARS = data.table(cars)
head(CARS)
tables()
## NAME NROW NCOL MB COLS KEY
## 1: CARS 50 2 0 speed,dist
## 2: DT 5 2 0 x,v
## Total: 0MB
sapply(DT, class)
## x v
## "character" "numeric"
DT
DT[2,]
DT[x=="b",]
cat(try(DT["b",], silent=T)) #no row names. must set key
## Error in `[.data.table`(DT, "b", ) :
## When i is a data.table (or character vector), the columns to join by must be specified using 'on=' argument (see ?data.table), by keying x (i.e. sorted, and, marked as sorted, see ?setkey), or by sharing column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.
setkey(DT,x)
DT #DT is now sorted by column x
haskey(DT); key(DT); attributes(DT); tables() #4 different ways to confirm DT has a key
## [1] TRUE
## [1] "x"
## $names
## [1] "x" "v"
##
## $row.names
## [1] 1 2 3 4 5
##
## $class
## [1] "data.table" "data.frame"
##
## $.internal.selfref
## <pointer: 0x0000000012a91ef0>
##
## $sorted
## [1] "x"
## NAME NROW NCOL MB COLS KEY
## 1: CARS 50 2 0 speed,dist
## 2: DT 5 2 0 x,v x
## Total: 0MB
DT["b",] #now we can call only "b" values from column 'x' (comma is optional)
DT["b"]
DT["b",mult="first"] #allows just the first row of the group to be returned
DT["b",mult="last"] #allows just the last row of the group to be returned
# Create a new data.frame to illustrate difference between 'vector scan' and 'binary search'
grpsize = ceiling(1e7/26^2) #10 million rows, 676 groups
#first with a data.frame
tt = system.time(DF <- data.frame(
x=rep(LETTERS, each=26*grpsize),
y=rep(letters, each=grpsize),
v=runif(grpsize*26^2),
stringsAsFactors=FALSE))
head(DF,3)
tail(DF,3)
dim(DF)
## [1] 10000068 3
tt=system.time(ans1 <- DF[DF$x=="R" & DF$y=="h",]) # this is a 'vector scan'
head(ans1,3)
tail(ans1,3)
dim(ans1)
## [1] 14793 3
#second with a data.table
DT = as.data.table(DF) # but normally use fread() or data.table() directly, originally
system.time(setkey(DT,x,y)) # one-off cost, usually
## user system elapsed
## 0.26 0.02 0.10
ss=system.time(ans2 <- DT[list("R","h")]) # this is a 'binary search'
head(ans2,3)
tail(ans2,3)
dim(ans2)
## [1] 14793 3
identical(ans1$v, ans2$v)
## [1] TRUE
#vector scanning a data.table is unwise, might as well just use a data.frame
system.time(ans1 <- DT[x=="R" & y=="h",]) # this works, but is silly
## user system elapsed
## 0 0 0
system.time(ans2 <- DF[DF$x=="R" & DF$y=="h",]) # the data.frame way, not much slower
## user system elapsed
## 0.09 0.00 0.09
mapply(identical, ans1, ans2)
## x y v
## TRUE TRUE TRUE
When we used x==“R” we scanned the entire column x, testing each and every value to see if it equalled “R”. We did it again in the y column, testing for “h”. Then & combined the two logical results to create a single logical vector which was passed to the [ method, which in turn searched it for TRUE and returned those rows. These were vectorized operations. They occurred internally in R and were very fast, but they were scans. We did those scans because we wrote that R code.
When i is a list (and data.table is a list too), we say that we are joining. In this case, we are joining DT to the 1 row, 2 column table returned by list(“R”,“h”). Since we do this a lot, there is an alias for list: .().
identical(DT[list("R","h")], DT[.("R","h")])
## [1] TRUE
DT[,sum(v)] # sum(v) is in the 'j' position
## [1] 4999880
DT[,sum(v),by=x] # 'by' indicates which groups element 'j' is the be repeated...by
#this approach is much, much faster than other options
ttt = system.time(tt <- tapply(DT$v,DT$x,sum)); ttt
## user system elapsed
## 0.33 0.01 0.34
sss = system.time(ss <- DT[,sum(v), b=x]); sss
## user system elapsed
## 0.18 0.08 0.06
head(tt)
## A B C D E F
## 192244.7 192080.7 192557.6 192201.6 192175.8 192145.3
head(ss)
identical(as.vector(tt), ss$V1)
## [1] TRUE
#especially when grouping by 2 or more columns
ttt = system.time(tt <- tapply(DT$v,list(DT$x,DT$y),sum)); ttt
## user system elapsed
## 0.56 0.10 0.65
sss = system.time(ss <- DT[,sum(v),by="x,y"]); sss
## user system elapsed
## 0.54 0.01 0.39
tt[1:5,1:5]
## a b c d e
## A 7338.671 7410.092 7377.000 7413.020 7401.229
## B 7358.148 7480.022 7387.543 7387.932 7439.042
## C 7405.975 7468.331 7436.933 7401.604 7389.409
## D 7392.955 7466.628 7396.581 7383.296 7390.394
## E 7402.446 7398.987 7404.454 7358.312 7421.215
head(ss)
identical(as.vector(t(tt)), ss$V1)
## [1] TRUE
#this is also known as 'last observation carried forward' (LOCF), or a 'rolling join'.
flights = fread("https://raw.githubusercontent.com/wiki/arunsrinivasan/flights/NYCflights14/flights14.csv")
head(flights)
dim(flights)
## [1] 253316 17
#subset
ans = flights[origin == "JFK" & month == 6L]
head(ans)
#select rows
ans = flights[1:2]
ans
ans = flights[order(origin, -dest)]
ans
#select column as vector
ans = flights[,arr_delay]
#select column as data.table (.() = list() in data.table)
ans = flights[,.(arr_delay)]
ans
#select multiple columns
ans = flights[,.(arr_delay, dep_delay)]
ans
#rename columns on selection
ans = flights[,.(delay_arr=arr_delay, delay_dep=dep_delay)]
ans
#perform computation on columns
ans = flights[,sum((arr_delay+dep_delay)<0)]
ans
## [1] 141814
#subset first, then compute
ans = flights[origin=="JFK" & month ==6L,.(m_arr=mean(arr_delay),
m_dep=mean(dep_delay))]
ans
ans = flights[origin == "JFK" & month == 6L, length(dest)]
ans
## [1] 8422
#...is equivalent to...
ans = flights[origin == "JFK" & month == 6L, .N]
ans
## [1] 8422
#...or as a data.table
ans = flights[origin == "JFK" & month == 6L, .(.N)]
ans
#select columns by names, like in data.frame
ans = flights[,c("arr_delay", "dep_delay"), with=F]
ans
#select a bunch of columns at once
ans = flights[, year:day, with=FALSE] # everything between 'year' and 'day'
ans
ans = flights[, day:year, with=FALSE] # the same, in reverse order
ans
ans = flights[, -(year:day), with=FALSE] # return everything but 'year' to 'day'
ans
ans = flights[, !(year:day), with=FALSE] # return everything byt 'year' to 'day'
ans
ans = flights[, .(.N), by=.(origin)]
ans
ans = flights[, .(.N), by="origin"]
ans
ans = flights[carrier == "AA", .N, by=origin]
ans
ans = flights[carrier == "AA", .N, by=.(origin,dest)]
head(ans)
ans = flights[carrier == "AA",
.(mean(arr_delay), mean(dep_delay)),
by=.(origin, dest, month)]
ans
ans = flights[carrier == "AA",
.(mean(arr_delay), mean(dep_delay)),
keyby=.(origin, dest, month)] # this orders the result
ans
#we can tack expressions one after another, forming a chain of operations,
#i.e., DT[ ... ][ ... ][ ... ]; this can avoid intermediate steps
ans = flights[carrier == "AA", .N, by=.(origin, dest)][order(origin, -dest)]
head(ans, 10)
ans = flights[, .N, .(dep_delay>0, arr_delay>0)]
ans
#to apply a function to multiple columns
DT = data.table(ID = c("b","b","b","a","a","c"), a = 1:6, b = 7:12, c=13:18)
DT[, lapply(.SD, mean), by=ID]
flights[carrier == "AA", # Only on trips with carrier "AA"
lapply(.SD, mean), # compute the mean
by=.(origin, dest, month), # for every 'origin,dest,month'
.SDcols=c("arr_delay", "dep_delay")] # for just those specified in .SDcols
ans = flights[, head(.SD, 2), by=month]
head(ans)
#flexibility in 'j'
DT[, .(val = c(a,b)), by=ID] # convert from wide to narrow
DT[, .(val = list(c(a,b))), by=ID] # less practically useful
DT[, print(c(a,b)), by=ID]
## [1] 1 2 3 7 8 9
## [1] 4 5 10 11
## [1] 6 12
DT[, print(list(c(a,b))), by=ID]
## [[1]]
## [1] 1 2 3 7 8 9
##
## [[1]]
## [1] 4 5 10 11
##
## [[1]]
## [1] 6 12
The general form of data.table syntax is:
DT[i, j, by]
We have seen so far that,
Using ‘i’: We can subset rows similar to a data.frame - except you don’t have to use DT$ repetitively since columns within the frame of a data.table are seen as if they are variables. We can also sort a data.table using order(), which internally uses data.table’s fast order for performance. We can do much more in i by keying a data.table, which allows blazing fast subsets and joins. We will see this in the “Keys and fast binary search based subsets” and “Joins and rolling joins” vignette. Using ‘j’: Select columns the data.table way: DT[, .(colA, colB)]. Select columns the data.frame way: DT[, c(“colA”, “colB”), with=FALSE]. Compute on columns: DT[, .(sum(colA), mean(colB))]. Provide names if necessary: DT[, .(sA =sum(colA), mB = mean(colB))]. Combine with i: DT[colA > value, sum(colB)]. Using ‘by’: Using by, we can group by columns by specifying a list of columns or a character vector of column names or even expressions. The flexibility of j, combined with by and i makes for a very powerful syntax. by can handle multiple columns and also expressions. We can keyby grouping columns to automatically sort the grouped result. We can use .SD and .SDcols in j to operate on multiple columns using already familiar base functions. Here are some examples:
DT[, lapply(.SD, fun), by=., .SDcols=…] - applies fun to all columns specified in .SDcols while grouping by the columns specified in by. DT[, head(.SD, 2), by=.] - return the first two rows for each group. DT[col > val, head(.SD, 1), by=.] - combine i along with j and by.
set.seed(1L)
DF = data.frame(ID1 = sample(letters[1:2], 10, TRUE),
ID2 = sample(1:3, 10, TRUE),
val = sample(10),
stringsAsFactors = FALSE,
row.names = sample(LETTERS[1:10]))
DF
rownames(DF)
## [1] "I" "D" "G" "A" "B" "E" "C" "J" "F" "H"
DF["C", ]
DT = as.data.table(DF)
DT
rownames(DT)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
setkey(flights, origin)
head(flights)
flights[.("JFK")]
key(flights)
## [1] "origin"
setkey(flights, origin, dest)
head(flights)
key(flights)
## [1] "origin" "dest"
flights[.("JFK", "MIA")]
flights[.("JFK")] # subset first key for 'JFK'
flights[.(unique(origin), "MIA")] # subset just the second key for 'MIA'
flights[.("LGA", "TPA"), .(arr_delay)]
flights[.("LGA", "TPA"), .(arr_delay)][order(-arr_delay)]
flights[.("LGA", "TPA"), max(arr_delay)]
## [1] 486
flights[, sort(unique(hour))]
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
setkey(flights, hour)
key(flights)
## [1] "hour"
flights[.(24), hour := 0L]
key(flights)
## NULL
flights[, sort(unique(hour))]
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
setkey(flights, origin, dest)
key(flights)
## [1] "origin" "dest"
ans = flights["JFK", max(dep_delay), keyby=month]
head(ans)
key(ans)
## [1] "month"
flights[.("JFK", "MIA"), mult="first"]
flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult="last"]
flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult="last", nomatch = 0L]
flights[.("JFK", "MIA")]
flights[origin == "JFK" & dest == "MIA"]
set.seed(2L)
N = 2e7L
DT = data.table(x = sample(letters, N, TRUE),
y = sample(1000L, N, TRUE),
val=runif(N), key = c("x", "y"))
print(object.size(DT), units="Mb")
## 381.5 Mb
key(DT)
## [1] "x" "y"
#vector scan approach
t1 <- system.time(ans1 <- DT[x == "g" & y == 877L])
t1
## user system elapsed
## 0 0 0
head(ans1)
dim(ans1)
## [1] 762 3
#using keys
t2 <- system.time(ans2 <- DT[.("g", 877L)])
t2
## user system elapsed
## 0.02 0.00 0.02
head(ans2)
dim(ans2)
## [1] 762 3
identical(ans1$val, ans2$val)
## [1] TRUE
Vector scan approach: The column x is searched for the value “g” row by row, on all 20 million of them. This results in a logical vector of size 20 million, with values TRUE, FALSE or NA corresponding to x’s value. Similarly, the column y is searched for 877 on all 20 million rows one by one, and stored in another logical vector. Element wise & operations are performed on the intermediate logical vectors and all the rows where the expression evaluates to TRUE are returned. This is what we call a vector scan approach. And this is quite inefficient, especially on larger tables and when one needs repeated subsetting, because it has to scan through all the rows each time.
Binary search approach: Here’s a very simple illustration. Let’s consider the (sorted) numbers shown below: 1, 5, 10, 19, 22, 23, 30
Suppose we’d like to find the matching position of the value 1, using binary search, this is how we would proceed - because we know that the data is sorted. Start with the middle value = 19. Is 1 == 19? No. 1 < 19. Since the value we’re looking for is smaller than 19, it should be somewhere before 19. So we can discard the rest of the half that are >= 19. Our set is now reduced to 1, 5, 10. Grab the middle value once again = 5. Is 1 == 5? No. 1 < 5. Our set is reduced to 1. Is 1 == 1? Yes. The corresponding index is also 1. And that’s the only match.
A vector scan approach on the other hand would have to scan through all the values (here, 7).
DF = data.frame(ID = c("b","b","b","a","a","c"), a = 1:6, b = 7:12, c=13:18)
DF
DF$c = 18:13 # replace entire column
DF$c[DF$ID == "b"] = 15:13
DF
#add columns by reference (speed and total delay of each flight)
flights[, `:=`(speed = distance / (air_time/60), # speed in km/hr
delay = arr_delay + dep_delay)] # delay in minutes
head(flights)
flights[, c("speed", "delay") := list(distance/(air_time/60), arr_delay + dep_delay)]
flights[, sort(unique(hour))]
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
flights[hour == 24L, hour := 0L]
flights[, c("delay") := NULL] # delete column by reference
head(flights)
flights[, `:=`(delay = NULL)]
## Warning in `[.data.table`(flights, , `:=`(delay = NULL)): Column 'delay' does
## not exist to remove
flights[, max_speed := max(speed), by=.(origin, dest)]
head(flights)
#mutliple columns using ':='
in_cols = c("dep_delay", "arr_delay")
out_cols = c("max_dep_delay", "max_arr_delay")
flights[, c(out_cols) := lapply(.SD, max), by = month, .SDcols = in_cols]
head(flights)